About the data: The Civil Rights Data Collection (CRDC) 2015-16 includes data from a survey of public school districts in the United States for school year 2015-16. It is submitted biannually to the US Department of Education Office of Civil Rights. Data includes school characteristics, services, and outcomes, most of which are disaggregated by race, ethnicity, gender, limited English proficiency status, and disability. Each school district submits data from the relevant year and certifies that it is correct. The public-use data is suppressed to protect confidentiality. Data can be downloaded here: https://ocrdata.ed.gov/DownloadDataFile. Data Definitions: https://ocrdata.ed.gov/DataDefinitions Data Notes: https://ocrdata.ed.gov/DataNotes User Manual: https://ocrdata.ed.gov/DataFileUsersManual

We are particularly interested in exploring variables related to discipline in schools, including in-school suspensions, absenteeism, and students reporting harassment or bullying on the basis of race and sex. Each of these variables is reported in the CRDC disaggregated by race, ethnicity, sex, and disability.

We tested whether there are significant differences in these indicators by subgroup using ANOVA followed by t-tests. For example, is there a difference in the rates of in-school suspensions by ethnicity?

We also see if the reported variables are correlated with various school characteristics, including student/teacher ratio, teacher absenteeism, and school expenditures per student.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(naniar)
## Warning: package 'naniar' was built under R version 3.6.1
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.1
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(corrplot)
## corrplot 0.84 loaded

Import dataset and subset to PA and relevant columns:

schoolData <- read.csv(file = "CRDC 2015-16 School Data.csv")
PA <- subset(schoolData, schoolData$LEA_STATE == "PA")
PA$KEY <- paste(PA$LEAID, PA$SCHID)  ## create a primary key based on school district and campus codes
dim(PA)
## [1] 3027 1837
## Cleaning the data includes removing indicators for NULL values:
## -7 indicates data that should have been reported but was not
## -5 indicates a district was unable to provide all required data but will in the future.
## -9 indicates a data element is not applicable (eg no students of a particular ethnicity). Some rows that should be -9 have a 0 because the skip logic did not always work.
## -2 indicates a small value due to confidentiality.

PA <- replace_with_na_all(data = PA, condition = ~.x %in% c(-9, -7, -5, -2)) 
## Subset PA to remove outliers in several columns:

## Subset PA to rows where total FTE of teachers is greater than one:
PA <- subset(PA, SCH_FTETEACH_TOT > 1)

## Subset PA to rows where total expenditures is not greater than 100,000,000 
## (There are a few extreme outliers):
PA <- subset(PA, SCH_SAL_TOTPERS_WFED < 100000000)
PA <- subset(PA, SCH_NPE_WFED < 100000000)

## Create fields for school characteristics:

PA$SAL_PER_TEACH <- with(PA, SCH_SAL_TEACH_WOFED / SCH_FTE_TEACH_WOFED)

PA$RATE_LEP <- with(PA, ifelse((TOT_ENR_M + TOT_ENR_F)>(SCH_ENR_LEP_M + SCH_ENR_LEP_F),
                               (SCH_ENR_LEP_M + SCH_ENR_LEP_F)/(TOT_ENR_M + TOT_ENR_F), NA))

PA$STU_TEA_RATIO <- with(PA, (TOT_ENR_M + TOT_ENR_F) / SCH_FTETEACH_TOT)

PA$TEA_ABSENT <- with(PA, ifelse(SCH_FTETEACH_TOT>SCH_FTETEACH_ABSENT, 
                                 SCH_FTETEACH_ABSENT / SCH_FTETEACH_TOT, NA))

PA$EXP_STU <- with(PA, (SCH_SAL_TOTPERS_WFED + SCH_NPE_WFED) / (TOT_ENR_M + TOT_ENR_F))

## Create fields for overall rates:
PA$RATE_ABSENT <- with(PA, ifelse((TOT_ENR_M + TOT_ENR_F)>=(TOT_ABSENT_M + TOT_ABSENT_F),
                                  (TOT_ABSENT_M + TOT_ABSENT_F)/(TOT_ENR_M + TOT_ENR_F), NA))

PA$RATE_ISS <- with(PA, ifelse((TOT_ENR_M + TOT_ENR_F)>=(TOT_DISCWODIS_ISS_M + TOT_DISCWODIS_ISS_F),
                               (TOT_DISCWODIS_ISS_M + TOT_DISCWODIS_ISS_F)/(TOT_ENR_M + TOT_ENR_F), NA))

PA$RATE_BULLY <- with(PA, ifelse((TOT_ENR_M + TOT_ENR_F)>=(TOT_HBREPORTED_RAC_M + TOT_HBREPORTED_RAC_F),
                                 (TOT_HBREPORTED_RAC_M + TOT_HBREPORTED_RAC_F)/(TOT_ENR_M + TOT_ENR_F), NA))




## Create data frame with school characteristics and overall rates:
PA_compare <- subset(PA, select = 
                       c(SAL_PER_TEACH, RATE_LEP, STU_TEA_RATIO, TEA_ABSENT, EXP_STU,
                         RATE_ABSENT, RATE_ISS, RATE_BULLY))

## Summary of relevant fields from PA:
sapply(PA_compare, summary)
## $SAL_PER_TEACH
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0   55087   62987   64561   71518  245800      10 
## 
## $RATE_LEP
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.008163 0.028461 0.027923 0.475410 
## 
## $STU_TEA_RATIO
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.60   12.35   13.98   14.32   15.75  144.80 
## 
## $TEA_ABSENT
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.1878  0.3373  0.3402  0.4762  0.9885      11 
## 
## $EXP_STU
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    4444    5979    6929    7954  105726 
## 
## $RATE_ABSENT
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0561  0.1163  0.1581  0.2142  0.9658      10 
## 
## $RATE_ISS
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## 0.000000 0.000000 0.003436 0.023122 0.023095 0.612903        7 
## 
## $RATE_BULLY
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## 0.0000000 0.0000000 0.0000000 0.0006494 0.0000000 0.0424837         2
## create correlation matrix for listed variables, using only complete observations without NAs
corr_matrix <- cor(PA_compare, use = "complete.obs")

## create a visualization of the correlation matrix
corrplot(corr_matrix, tl.col = "black", method = "color", order = "hclust", addCoef.col = "black",
         sig.level = 0.01, insig = "blank",type = "upper", diag = FALSE)

## create an R squared correlation matrix:
corr_matrix_2 <- corr_matrix ^ 2

corrplot(corr_matrix_2, tl.col = "black", method = "color", order = "hclust", addCoef.col = "black",
         sig.level = 0.01, insig = "blank",type = "upper", diag = FALSE)

## None of the school characteristics appear to be very correlated with the overall rates 
## of ISS, absenteeism, or reported bullying. The highest correlation is between 
## student/teacher ratio and expenditures per student, followed by the rate of 
## limited English proficient students and the overall rate of absenteeism.

## heat map version of the correlation matrix to make it compatible with ploty output
vals <- unique(scales::rescale(c(corr_matrix)))
o <- order(vals, decreasing = FALSE)
cols <- scales::col_numeric("Reds", domain = NULL)(vals)
colz <- setNames(data.frame(vals[o], cols[o]), NULL)

trial <- plot_ly( x = c("SAL_PER_TEACH","STU_TEA_RATIO","TEA_ABSENT", "RATE_LEP", "RATE_ABSENT", "EXP_STU", "RATE_ISS", "RATE_BULLY"),y = c("SAL_PER_TEACH","STU_TEA_RATIO","TEA_ABSENT", "RATE_LEP", "RATE_ABSENT", "EXP_STU", "RATE_ISS", "RATE_BULLY"),z = corr_matrix, type = "heatmap",colorscale = colz)
trial

Calculate rates by ethnicity and gender for reported bullying, absenteeism, and ISS:

## Create rates of each indicator by ethnicity and gender, 
## where there are more than 5 students in each subgroup and 
##the number enrolled is greater than the numerator 
## (to eliminate misreported data with rates over 100%):

## Enrollment by ethnicity
PA <- PA %>% mutate(SCH_ENR_HI = SCH_ENR_HI_M + SCH_ENR_HI_F)
PA <- PA %>% mutate(SCH_ENR_BL = SCH_ENR_BL_M + SCH_ENR_BL_F)
PA <- PA %>% mutate(SCH_ENR_WH = SCH_ENR_WH_M + SCH_ENR_WH_F)
PA <- PA %>% mutate(SCH_ENR_AS = SCH_ENR_AS_M + SCH_ENR_AS_F)



## Bullying rates
PA <- PA %>% mutate(SCH_BULLY_HI = SCH_HBREPORTED_RAC_HI_M + SCH_HBREPORTED_RAC_HI_F)
PA <- PA %>% mutate(SCH_BULLY_BL = SCH_HBREPORTED_RAC_BL_M + SCH_HBREPORTED_RAC_BL_F)
PA <- PA %>% mutate(SCH_BULLY_WH = SCH_HBREPORTED_RAC_WH_M + SCH_HBREPORTED_RAC_WH_F)
PA <- PA %>% mutate(SCH_BULLY_AS = SCH_HBREPORTED_RAC_AS_M + SCH_HBREPORTED_RAC_AS_F)

PA <- PA %>% mutate(RATE_BULLY_HI = ifelse(SCH_ENR_HI>=SCH_BULLY_HI & SCH_ENR_HI>=5, 
                                                 SCH_BULLY_HI/SCH_ENR_HI, NA))
PA <- PA %>% mutate(RATE_BULLY_BL = ifelse(SCH_ENR_BL>=SCH_BULLY_BL & SCH_ENR_BL>=5, 
                                                 SCH_BULLY_BL/SCH_ENR_BL, NA))
PA <- PA %>% mutate(RATE_BULLY_WH = ifelse(SCH_ENR_WH>=SCH_BULLY_WH & SCH_ENR_WH>=5, 
                                                 SCH_BULLY_WH/SCH_ENR_WH, NA))
PA <- PA %>% mutate(RATE_BULLY_AS = ifelse(SCH_ENR_AS>=SCH_BULLY_AS & SCH_ENR_AS>=5, 
                                                 SCH_BULLY_AS/SCH_ENR_AS, NA))
PA <- PA %>% mutate(RATE_BULLY_M = ifelse(TOT_ENR_M>=TOT_HBREPORTED_SEX_M & TOT_ENR_M>=5, 
                                                 TOT_HBREPORTED_SEX_M/TOT_ENR_M, NA))
PA <- PA %>% mutate(RATE_BULLY_F = ifelse(TOT_ENR_F>TOT_HBREPORTED_SEX_F & TOT_ENR_F>5, 
                                                 TOT_HBREPORTED_SEX_F/TOT_ENR_F, NA))

## ISS rates
PA <- PA %>% mutate(SCH_ISS_HI = SCH_DISCWODIS_ISS_HI_M + SCH_DISCWODIS_ISS_HI_F)
PA <- PA %>% mutate(SCH_ISS_BL = SCH_DISCWODIS_ISS_BL_M + SCH_DISCWODIS_ISS_BL_F)
PA <- PA %>% mutate(SCH_ISS_WH = SCH_DISCWODIS_ISS_WH_M + SCH_DISCWODIS_ISS_WH_F)
PA <- PA %>% mutate(SCH_ISS_AS = SCH_DISCWODIS_ISS_AS_M + SCH_DISCWODIS_ISS_AS_F)

PA <- PA %>% mutate(RATE_ISS_HI = ifelse(SCH_ENR_HI>=SCH_ISS_HI & SCH_ENR_HI>=5, 
                                                 SCH_ISS_HI/SCH_ENR_HI, NA))
PA <- PA %>% mutate(RATE_ISS_BL = ifelse(SCH_ENR_BL>=SCH_ISS_BL & SCH_ENR_BL>=5, 
                                                 SCH_ISS_BL/SCH_ENR_BL, NA))
PA <- PA %>% mutate(RATE_ISS_WH = ifelse(SCH_ENR_WH>=SCH_ISS_WH & SCH_ENR_WH>=5, 
                                                 SCH_ISS_WH/SCH_ENR_WH, NA))
PA <- PA %>% mutate(RATE_ISS_AS = ifelse(SCH_ENR_AS>=SCH_ISS_AS & SCH_ENR_AS>=5, 
                                                 SCH_ISS_AS/SCH_ENR_AS, NA))
PA <- PA %>% mutate(RATE_ISS_M = ifelse(TOT_ENR_M>=TOT_DISCWODIS_ISS_M & TOT_ENR_M>=5, 
                                                 TOT_DISCWODIS_ISS_M/TOT_ENR_M, NA))
PA <- PA %>% mutate(RATE_ISS_F = ifelse(TOT_ENR_F>TOT_DISCWODIS_ISS_F & TOT_ENR_F>5, 
                                                 TOT_DISCWODIS_ISS_F/TOT_ENR_F, NA))
## Absent rates
PA <- PA %>% mutate(SCH_ABSENT_HI = SCH_ABSENT_HI_M + SCH_ABSENT_HI_F)
PA <- PA %>% mutate(SCH_ABSENT_BL = SCH_ABSENT_BL_M + SCH_ABSENT_BL_F)
PA <- PA %>% mutate(SCH_ABSENT_WH = SCH_ABSENT_WH_M + SCH_ABSENT_WH_F)
PA <- PA %>% mutate(SCH_ABSENT_AS = SCH_ABSENT_AS_M + SCH_ABSENT_AS_F)

PA <- PA %>% mutate(RATE_ABSENT_HI = ifelse(SCH_ENR_HI>=SCH_ABSENT_HI & SCH_ENR_HI>=5, 
                                            SCH_ABSENT_HI/SCH_ENR_HI, NA))
PA <- PA %>% mutate(RATE_ABSENT_BL = ifelse(SCH_ENR_BL>=SCH_ABSENT_BL & SCH_ENR_BL>=5, 
                                            SCH_ABSENT_BL/SCH_ENR_BL, NA))
PA <- PA %>% mutate(RATE_ABSENT_WH = ifelse(SCH_ENR_WH>=SCH_ABSENT_WH & SCH_ENR_WH>=5, 
                                            SCH_ABSENT_WH/SCH_ENR_WH, NA))
PA <- PA %>% mutate(RATE_ABSENT_AS = ifelse(SCH_ENR_AS>=SCH_ABSENT_AS & SCH_ENR_AS>=5, 
                                            SCH_ABSENT_AS/SCH_ENR_AS, NA))
PA <- PA %>% mutate(RATE_ABSENT_M = ifelse(TOT_ENR_M>=TOT_ABSENT_M & TOT_ENR_M>=5, 
                                            TOT_ABSENT_M/TOT_ENR_M, NA))
PA <- PA %>% mutate(RATE_ABSENT_F = ifelse(TOT_ENR_F>=TOT_ABSENT_F & TOT_ENR_F>=5, 
                                            TOT_ABSENT_F/TOT_ENR_F, NA))

PA_bully_rates <- subset(PA, select = c(KEY, RATE_BULLY_HI, RATE_BULLY_BL, RATE_BULLY_WH, RATE_BULLY_AS,
                                              RATE_BULLY_M, RATE_BULLY_F))
PA_iss_rates <- subset(PA, select = c(KEY, RATE_ISS_HI, RATE_ISS_BL, RATE_ISS_WH, RATE_ISS_AS,
                                              RATE_ISS_M, RATE_ISS_F))
PA_abs_rates <- subset(PA, select = c(KEY, RATE_ABSENT_HI, RATE_ABSENT_BL, RATE_ABSENT_WH, RATE_ABSENT_AS,
                                              RATE_ABSENT_M, RATE_ABSENT_F))

## New data frame with sums of each ethnicity to give an insight into the data : 
enroll <- subset(PA, select = c(SCH_ENR_HI, SCH_ENR_BL, SCH_ENR_WH, SCH_ENR_AS, TOT_ENR_M, TOT_ENR_F))
enroll_sums <- sapply(enroll, sum)


## Keep rows with fewer than 3 NAs for each indicator (at least 2 ethnicities to compare):
PA_bully_rates <- PA_bully_rates[rowSums(is.na(PA_bully_rates)) < 3, ]
PA_iss_rates <- PA_iss_rates[rowSums(is.na(PA_iss_rates)) < 3, ]
PA_abs_rates <- PA_abs_rates[rowSums(is.na(PA_abs_rates)) < 3, ]

## combine rates into one data frame
PA_rates <- merge(merge(PA_bully_rates, PA_iss_rates, by="KEY", all.x=TRUE, all.Y=TRUE), 
                  PA_abs_rates, by="KEY", all.x=TRUE, all.y=TRUE)

Show distribution of data, ANOVA, and t-tests:

## Show 10 rows of each data frame:
PA_bully_rates[sample(nrow(PA_bully_rates), 10),]
## # A tibble: 10 x 7
##    KEY   RATE_BULLY_HI RATE_BULLY_BL RATE_BULLY_WH RATE_BULLY_AS
##    <chr>         <dbl>         <dbl>         <dbl>         <dbl>
##  1 4218~             0             0            NA            NA
##  2 4215~             0             0             0            NA
##  3 4218~             0             0             0             0
##  4 4212~             0             0             0             0
##  5 4207~             0             0             0             0
##  6 4210~             0             0             0             0
##  7 4209~            NA             0             0            NA
##  8 4219~             0             0             0            NA
##  9 4220~             0             0             0             0
## 10 4200~             0             0             0            NA
## # ... with 2 more variables: RATE_BULLY_M <dbl>, RATE_BULLY_F <dbl>
PA_iss_rates[sample(nrow(PA_iss_rates), 10),]
## # A tibble: 10 x 7
##    KEY   RATE_ISS_HI RATE_ISS_BL RATE_ISS_WH RATE_ISS_AS RATE_ISS_M
##    <chr>       <dbl>       <dbl>       <dbl>       <dbl>      <dbl>
##  1 4202~      0          0           0           NA         0      
##  2 4226~      0          0           0            0         0      
##  3 4200~      0          0.00550     0            0         0.00249
##  4 4226~      0          0           0            0         0      
##  5 4205~      0          0.05        0.00127      0         0.00217
##  6 4219~      0          0           0.00144      0         0.00232
##  7 4207~      0          0           0           NA         0      
##  8 4219~     NA          0           0           NA         0      
##  9 4202~      0          0           0.00333      0         0.00443
## 10 4209~      0.0493     0.154       0.0676       0.0909    0.126  
## # ... with 1 more variable: RATE_ISS_F <dbl>
PA_abs_rates[sample(nrow(PA_abs_rates), 10),]
## # A tibble: 10 x 7
##    KEY   RATE_ABSENT_HI RATE_ABSENT_BL RATE_ABSENT_WH RATE_ABSENT_AS
##    <chr>          <dbl>          <dbl>          <dbl>          <dbl>
##  1 4220~         0.4             0.08          0.137          0.286 
##  2 4218~         0.303           0.390        NA              0     
##  3 4218~         0.4             0.224         0.4           NA     
##  4 4218~         0.104           0.145         0.0693         0.0752
##  5 4224~         0.571           0.286         0.215         NA     
##  6 4224~        NA               0.571         0.188         NA     
##  7 4215~         0.345           0.368         0.242         NA     
##  8 4200~        NA               0.516         0.594         NA     
##  9 4200~         0.324           0.354         0.196          0.157 
## 10 4225~         0.0625          0             0.0710         0     
## # ... with 2 more variables: RATE_ABSENT_M <dbl>, RATE_ABSENT_F <dbl>
## Gather each into longer datasets for ANOVA and boxplots:
PA_bully_rates_long <- PA_bully_rates %>% gather(key="subgroup", value="rate", -KEY)
PA_iss_rates_long <- PA_iss_rates %>% gather(key="subgroup", value="rate", -KEY)
PA_abs_rates_long <- PA_abs_rates %>% gather(key="subgroup", value="rate", -KEY)

## Show boxplots of distributions
boxplot(rate ~ subgroup, data = PA_bully_rates_long)

boxplot(rate ~ subgroup, data = PA_iss_rates_long)

boxplot(rate ~ subgroup, data = PA_abs_rates_long)

## ANOVA
summary(aov(rate ~ subgroup, data = PA_bully_rates_long))
##                Df Sum Sq Mean Sq F value Pr(>F)    
## subgroup        5  0.075 0.01505    50.1 <2e-16 ***
## Residuals   13586  4.082 0.00030                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1870 observations deleted due to missingness
summary(aov(rate ~ subgroup, data = PA_iss_rates_long))
##                Df Sum Sq Mean Sq F value Pr(>F)    
## subgroup        5   2.54  0.5074     114 <2e-16 ***
## Residuals   13578  60.45  0.0045                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1866 observations deleted due to missingness
summary(aov(rate ~ subgroup, data = PA_abs_rates_long))
##                Df Sum Sq Mean Sq F value Pr(>F)    
## subgroup        5   13.5  2.6901   97.32 <2e-16 ***
## Residuals   13552  374.6  0.0276                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1856 observations deleted due to missingness
## ANOVA suggests that at lease one subgroup is significantly different.

## Show summary of each indicator as well as the Shapiro-Wilk normality test.
sapply(PA_rates, summary)
## $KEY
##    Length     Class      Mode 
##      2577 character character 
## 
## $RATE_BULLY_HI
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.0016  0.0000  0.4000     349 
## 
## $RATE_BULLY_BL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.0072  0.0000  0.4000     359 
## 
## $RATE_BULLY_WH
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00000 0.00000 0.00000 0.00046 0.00000 0.28571      89 
## 
## $RATE_BULLY_AS
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.0008  0.0000  0.2857    1064 
## 
## $RATE_BULLY_M
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## 0.0000000 0.0000000 0.0000000 0.0005549 0.0000000 0.1944444         1 
## 
## $RATE_BULLY_F
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## 0.000000 0.000000 0.000000 0.001156 0.000000 0.099174        8 
## 
## $RATE_ISS_HI
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.0301  0.0095  1.0000     351 
## 
## $RATE_ISS_BL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.0547  0.0645  0.7568     361 
## 
## $RATE_ISS_WH
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00000 0.00000 0.00000 0.01922 0.01884 0.63415      91 
## 
## $RATE_ISS_AS
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.0111  0.0000  0.5714    1064 
## 
## $RATE_ISS_M
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## 0.000000 0.000000 0.004854 0.030583 0.034675 0.634146        3 
## 
## $RATE_ISS_F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00000 0.00000 0.00000 0.01694 0.01224 0.65854       8 
## 
## $RATE_ABSENT_HI
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0297  0.1523  0.1894  0.2857  1.0000     351 
## 
## $RATE_ABSENT_BL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0502  0.1693  0.2075  0.3077  1.0000     366 
## 
## $RATE_ABSENT_WH
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00000 0.05013 0.10526 0.15368 0.20505 1.00000      98 
## 
## $RATE_ABSENT_AS
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0455  0.0939  0.1429  1.0000    1064 
## 
## $RATE_ABSENT_M
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00000 0.05694 0.11283 0.15676 0.21285 0.95937       9 
## 
## $RATE_ABSENT_F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00000 0.05369 0.11675 0.16183 0.22297 1.00000      16
sapply(PA_rates[,-1], shapiro.test)
##           RATE_BULLY_HI                 RATE_BULLY_BL                
## statistic 0.0723096                     0.1966283                    
## p.value   8.401501e-74                  7.957537e-71                 
## method    "Shapiro-Wilk normality test" "Shapiro-Wilk normality test"
## data.name "X[[i]]"                      "X[[i]]"                     
##           RATE_BULLY_WH                 RATE_BULLY_AS                
## statistic 0.04181355                    0.04182463                   
## p.value   4.359223e-77                  1.21565e-65                  
## method    "Shapiro-Wilk normality test" "Shapiro-Wilk normality test"
## data.name "X[[i]]"                      "X[[i]]"                     
##           RATE_BULLY_M                  RATE_BULLY_F                 
## statistic 0.0889887                     0.2359587                    
## p.value   7.070669e-77                  3.248394e-73                 
## method    "Shapiro-Wilk normality test" "Shapiro-Wilk normality test"
## data.name "X[[i]]"                      "X[[i]]"                     
##           RATE_ISS_HI                   RATE_ISS_BL                  
## statistic 0.4611547                     0.5943165                    
## p.value   3.06821e-63                   5.680843e-58                 
## method    "Shapiro-Wilk normality test" "Shapiro-Wilk normality test"
## data.name "X[[i]]"                      "X[[i]]"                     
##           RATE_ISS_WH                   RATE_ISS_AS                  
## statistic 0.488022                      0.2823524                    
## p.value   1.196979e-64                  2.416934e-60                 
## method    "Shapiro-Wilk normality test" "Shapiro-Wilk normality test"
## data.name "X[[i]]"                      "X[[i]]"                     
##           RATE_ISS_M                    RATE_ISS_F                   
## statistic 0.5557307                     0.3858765                    
## p.value   1.000429e-62                  7.152518e-69                 
## method    "Shapiro-Wilk normality test" "Shapiro-Wilk normality test"
## data.name "X[[i]]"                      "X[[i]]"                     
##           RATE_ABSENT_HI                RATE_ABSENT_BL               
## statistic 0.869635                      0.8964269                    
## p.value   1.242625e-39                  3.086279e-36                 
## method    "Shapiro-Wilk normality test" "Shapiro-Wilk normality test"
## data.name "X[[i]]"                      "X[[i]]"                     
##           RATE_ABSENT_WH                RATE_ABSENT_AS               
## statistic 0.7993502                     0.7408348                    
## p.value   4.047324e-48                  1.689564e-43                 
## method    "Shapiro-Wilk normality test" "Shapiro-Wilk normality test"
## data.name "X[[i]]"                      "X[[i]]"                     
##           RATE_ABSENT_M                 RATE_ABSENT_F                
## statistic 0.8359583                     0.8287372                    
## p.value   1.630579e-45                  3.772165e-46                 
## method    "Shapiro-Wilk normality test" "Shapiro-Wilk normality test"
## data.name "X[[i]]"                      "X[[i]]"
## Even though data are non-normal, T-tests are accurate if the sample size is large enough 
## (there are over 2500 observations).

## Individual t-test p-values:

cat("The rates of males and females reporting bullying on the basis of sex are significantly different with a p-value of", round(t.test(PA_rates$RATE_BULLY_M, PA_rates$RATE_BULLY_F)$p.value, 8))
## The rates of males and females reporting bullying on the basis of sex are significantly different with a p-value of 8.7e-06
cat("The rates of Hispanics and Whites reporting bullying on the basis of race are significantly different with a p-value of", round(t.test(PA_rates$RATE_BULLY_HI, PA_rates$RATE_BULLY_WH)$p.value, 8))
## The rates of Hispanics and Whites reporting bullying on the basis of race are significantly different with a p-value of 0.00187247
                                                                                                                    cat("The rates of Hispanics and Blacks reporting bullying on the basis of race are significantly different with a p-value of", round(t.test(PA_rates$RATE_BULLY_HI, PA_rates$RATE_BULLY_BL)$p.value, 8))
## The rates of Hispanics and Blacks reporting bullying on the basis of race are significantly different with a p-value of 0
cat("The rates of Hispanics and Asians reporting bullying on the basis of race are not significantly different with a p-value of", round(t.test(PA_rates$RATE_BULLY_HI, PA_rates$RATE_BULLY_AS)$p.value, 8))
## The rates of Hispanics and Asians reporting bullying on the basis of race are not significantly different with a p-value of 0.05946567
cat("The rates of Blacks and Whites reporting bullying on the basis of race are significantly different with a p-value of", round(t.test(PA_rates$RATE_BULLY_BL, PA_rates$RATE_BULLY_WH)$p.value, 8))
## The rates of Blacks and Whites reporting bullying on the basis of race are significantly different with a p-value of 0
cat("The rates of Blacks and Asians reporting bullying on the basis of race are significantly different with a p-value of", round(t.test(PA_rates$RATE_BULLY_BL, PA_rates$RATE_BULLY_AS)$p.value, 8))
## The rates of Blacks and Asians reporting bullying on the basis of race are significantly different with a p-value of 0
cat("The rates of Asians and Whites reporting bullying on the basis of race are not significantly different with a p-value of", round(t.test(PA_rates$RATE_BULLY_AS, PA_rates$RATE_BULLY_WH)$p.value, 8))
## The rates of Asians and Whites reporting bullying on the basis of race are not significantly different with a p-value of 0.3290349
cat("The rates of males and females with in-school suspensions are significantly different with a p-value of", round(t.test(PA_rates$RATE_ISS_M, PA_rates$RATE_ISS_F)$p.value, 8))
## The rates of males and females with in-school suspensions are significantly different with a p-value of 0
cat("The rates of Hispanics and Whites with in-school suspensions are significantly different with a p-value of", round(t.test(PA_rates$RATE_ISS_HI, PA_rates$RATE_ISS_WH)$p.value, 8))
## The rates of Hispanics and Whites with in-school suspensions are significantly different with a p-value of 0
cat("The rates of Hispanics and Blacks with in-school suspensions are significantly different with a p-value of", round(t.test(PA_rates$RATE_ISS_HI, PA_rates$RATE_ISS_BL)$p.value, 8))
## The rates of Hispanics and Blacks with in-school suspensions are significantly different with a p-value of 0
cat("The rates of Hispanics and Asians with in-school suspensions are significantly different with a p-value of", round(t.test(PA_rates$RATE_ISS_HI, PA_rates$RATE_ISS_AS)$p.value, 8))
## The rates of Hispanics and Asians with in-school suspensions are significantly different with a p-value of 0
cat("The rates of Blacks and Whites with in-school suspensions are significantly different with a p-value of", round(t.test(PA_rates$RATE_ISS_BL, PA_rates$RATE_ISS_WH)$p.value, 8))
## The rates of Blacks and Whites with in-school suspensions are significantly different with a p-value of 0
cat("The rates of Blacks and Asians with in-school suspensions are significantly different with a p-value of", round(t.test(PA_rates$RATE_ISS_BL, PA_rates$RATE_ISS_AS)$p.value, 8))
## The rates of Blacks and Asians with in-school suspensions are significantly different with a p-value of 0
cat("The rates of Asians and Whites with in-school suspensions are significantly different with a p-value of", round(t.test(PA_rates$RATE_ISS_AS, PA_rates$RATE_ISS_WH)$p.value, 8))
## The rates of Asians and Whites with in-school suspensions are significantly different with a p-value of 1e-08
cat("The rates of males and females who are chronically absent are not significantly different with a p-value of", round(t.test(PA_rates$RATE_ABSENT_M, PA_rates$RATE_ABSENT_F)$p.value, 8))
## The rates of males and females who are chronically absent are not significantly different with a p-value of 0.2407765
cat("The rates of Hispanics and Whites who are chronically absent are significantly different with a p-value of", round(t.test(PA_rates$RATE_ABSENT_HI, PA_rates$RATE_ABSENT_WH)$p.value, 8))
## The rates of Hispanics and Whites who are chronically absent are significantly different with a p-value of 0
cat("The rates of Hispanics and Blacks who are chronically absent are significantly different with a p-value of", round(t.test(PA_rates$RATE_ABSENT_HI, PA_rates$RATE_ABSENT_BL)$p.value, 8))
## The rates of Hispanics and Blacks who are chronically absent are significantly different with a p-value of 0.00163911
cat("The rates of Hispanics and Asians who are chronically absent are significantly different with a p-value of", round(t.test(PA_rates$RATE_ABSENT_HI, PA_rates$RATE_ABSENT_AS)$p.value, 8))
## The rates of Hispanics and Asians who are chronically absent are significantly different with a p-value of 0
cat("The rates of Blacks and Whites who are chronically absent are significantly different with a p-value of", round(t.test(PA_rates$RATE_ABSENT_BL, PA_rates$RATE_ABSENT_WH)$p.value, 8))
## The rates of Blacks and Whites who are chronically absent are significantly different with a p-value of 0
cat("The rates of Blacks and Asians who are chronically absent are significantly different with a p-value of", round(t.test(PA_rates$RATE_ABSENT_BL, PA_rates$RATE_ABSENT_AS)$p.value, 8))
## The rates of Blacks and Asians who are chronically absent are significantly different with a p-value of 0
cat("The rates of Asians and Whites who are chronically absent are significantly different with a p-value of", round(t.test(PA_rates$RATE_ABSENT_AS, PA_rates$RATE_ABSENT_WH)$p.value, 8))
## The rates of Asians and Whites who are chronically absent are significantly different with a p-value of 0

The following text is included as a glossary page on the app:

cat("RATE_BULLY: For ethnicities, the number of students reported as harassed or bullied on the basis of race, color or national origin divided by the total number of enrolled students. For genders, the number of students reported as harassed or bullied on the basis of sex divided by the total number of enrolled students.", "RATE_ISS: The number of students without disabilities who received one or more in-school suspensions divided by the total number of enrolled students.", "RATE_ABSENT: The number of students absent 15 or more days in the school year divided by the total number of enrolled students.", "RATE_LEP: The number of students who have limited English proficiency divided by the total number of enrolled students.", "SAL_PER_TEACH: Total salary expenditures for all teachers divided by the total FTE of teachers.", "TEA_ABSENT: Number of FTE teachers who were absent more than 10 school days during the school year divided by the number of FTE teachers.", "STU_TEA_RATIO: Total number of enrolled students divided by the number of FTE teachers.", "STU_EXP: Total salary expenditures for all personnel plus total non-salary expenditures divided by the total number of enrolled students.", sep="\n\n")
## RATE_BULLY: For ethnicities, the number of students reported as harassed or bullied on the basis of race, color or national origin divided by the total number of enrolled students. For genders, the number of students reported as harassed or bullied on the basis of sex divided by the total number of enrolled students.
## 
## RATE_ISS: The number of students without disabilities who received one or more in-school suspensions divided by the total number of enrolled students.
## 
## RATE_ABSENT: The number of students absent 15 or more days in the school year divided by the total number of enrolled students.
## 
## RATE_LEP: The number of students who have limited English proficiency divided by the total number of enrolled students.
## 
## SAL_PER_TEACH: Total salary expenditures for all teachers divided by the total FTE of teachers.
## 
## TEA_ABSENT: Number of FTE teachers who were absent more than 10 school days during the school year divided by the number of FTE teachers.
## 
## STU_TEA_RATIO: Total number of enrolled students divided by the number of FTE teachers.
## 
## STU_EXP: Total salary expenditures for all personnel plus total non-salary expenditures divided by the total number of enrolled students.

Create histograms of each combination of subgroups and indicators to show on the app:

## function to create plots

my_plot <- function(hist1, var1, color1, hist2, var2, color2,x) 
{
p <- plot_ly(alpha = 0.6) %>% 
     add_histogram(name = hist1,x =var1, color = I(color1)) %>% 
     add_histogram( name = hist2, x =var2,color = I(color2)) %>% 
     layout(barmode = "overlay", xaxis = list(title = x ), yaxis = list(title = "No. Of Schools") )
return(p)  
}


## absentiesm rates for different ethnicities
abs_WH_BL <- my_plot("White", PA_abs_rates$RATE_ABSENT_WH,"purple", "Black", PA_abs_rates$RATE_ABSENT_BL,"yellow","Absenteeism Rates")

abs_WH_HI <- my_plot("White", PA_abs_rates$RATE_ABSENT_WH,"purple", "Hispanic", PA_abs_rates$RATE_ABSENT_HI,"blue","Absenteeism Rates")

abs_WH_AS <- my_plot("White", PA_abs_rates$RATE_ABSENT_WH,"purple", "Asian", PA_abs_rates$RATE_ABSENT_AS,"green","Absenteeism Rates")

abs_BL_HI <- my_plot("Black", PA_abs_rates$RATE_ABSENT_BL,"yellow", "Hispanic", PA_abs_rates$RATE_ABSENT_HI,"blue","Absenteeism Rates")

abs_AS_HI <- my_plot("Asian", PA_abs_rates$RATE_ABSENT_AS,"green", "Hispanic", PA_abs_rates$RATE_ABSENT_HI,"blue","Absenteeism Rates")

abs_BL_AS <- my_plot("Black", PA_abs_rates$RATE_ABSENT_BL,"yellow", "Asian", PA_abs_rates$RATE_ABSENT_AS,"green","Absenteeism Rates")

abs_WH_HI
## Warning: Ignoring 90 observations
## Warning: Ignoring 343 observations
abs_WH_AS
## Warning: Ignoring 90 observations
## Warning: Ignoring 1056 observations
abs_WH_BL
## Warning: Ignoring 90 observations
## Warning: Ignoring 358 observations
abs_BL_AS
## Warning: Ignoring 358 observations
## Warning: Ignoring 1056 observations
abs_BL_HI
## Warning: Ignoring 358 observations
## Warning: Ignoring 343 observations
abs_AS_HI
## Warning: Ignoring 1056 observations

## Warning: Ignoring 343 observations
## in school suspension (ISS) rates

iss_WH_BL <- my_plot("White", PA_iss_rates$RATE_ISS_WH,"purple", "Black", PA_iss_rates$RATE_ISS_BL,"yellow","In school Suspension(ISS) Rates")

iss_WH_HI <- my_plot("White", PA_iss_rates$RATE_ISS_WH,"purple", "Hispanic", PA_iss_rates$RATE_ISS_HI,"blue","In school Suspension(ISS) Rates")

iss_WH_AS <- my_plot("White", PA_iss_rates$RATE_ISS_WH,"purple", "Asian", PA_iss_rates$RATE_ISS_AS,"green","In school Suspension(ISS) Rates")

iss_BL_HI <- my_plot("Black", PA_iss_rates$RATE_ISS_BL,"yellow", "Hispanic", PA_iss_rates$RATE_ISS_HI,"blue","In school Suspension(ISS) Rates")

iss_AS_HI <- my_plot("Asian", PA_iss_rates$RATE_ISS_AS,"green", "Hispanic", PA_iss_rates$RATE_ISS_HI,"blue","In school Suspension(ISS) Rates")

iss_BL_AS <- my_plot("Black", PA_iss_rates$RATE_ISS_BL,"yellow", "Asian", PA_iss_rates$RATE_ISS_AS,"green","In school Suspension(ISS) Rates")

iss_WH_AS
## Warning: Ignoring 89 observations
## Warning: Ignoring 1062 observations
iss_WH_HI
## Warning: Ignoring 89 observations
## Warning: Ignoring 349 observations
iss_WH_BL
## Warning: Ignoring 89 observations
## Warning: Ignoring 359 observations
iss_BL_AS
## Warning: Ignoring 359 observations
## Warning: Ignoring 1062 observations
iss_BL_HI
## Warning: Ignoring 359 observations
## Warning: Ignoring 349 observations
iss_AS_HI
## Warning: Ignoring 1062 observations

## Warning: Ignoring 349 observations
## bullying rates for different ethnicities

bully_WH_BL <- my_plot("White", PA_bully_rates$RATE_BULLY_WH,"purple", "Black", PA_bully_rates$RATE_BULLY_BL,"yellow","Bullying Rates")

bully_WH_HI <- my_plot("White", PA_bully_rates$RATE_BULLY_WH,"purple", "Hispanic", PA_bully_rates$RATE_BULLY_HI,"blue","Bullying Rates")

bully_WH_AS <- my_plot("White", PA_bully_rates$RATE_BULLY_WH,"purple", "Asian", PA_bully_rates$RATE_BULLY_AS,"green","Bullying Rates")

bully_BL_HI <- my_plot("Black", PA_bully_rates$RATE_BULLY_BL,"yellow", "Hispanic", PA_bully_rates$RATE_BULLY_HI,"blue","Bullying Rates")

bully_AS_HI <- my_plot("Asian", PA_bully_rates$RATE_BULLY_AS,"green", "Hispanic", PA_bully_rates$RATE_BULLY_HI,"blue","Bullying Rates")

bully_BL_AS <- my_plot("Black", PA_bully_rates$RATE_BULLY_BL,"yellow", "Asian", PA_bully_rates$RATE_BULLY_AS,"green","Bullying Rates")

bully_WH_AS
## Warning: Ignoring 89 observations
## Warning: Ignoring 1064 observations
bully_WH_BL
## Warning: Ignoring 89 observations
## Warning: Ignoring 359 observations
bully_WH_HI
## Warning: Ignoring 89 observations
## Warning: Ignoring 349 observations
bully_BL_AS
## Warning: Ignoring 359 observations
## Warning: Ignoring 1064 observations
bully_BL_HI
## Warning: Ignoring 359 observations
## Warning: Ignoring 349 observations
bully_AS_HI
## Warning: Ignoring 1064 observations

## Warning: Ignoring 349 observations
# male and female rates for each indicator

bully_M_F <- my_plot("Male", PA_bully_rates$RATE_BULLY_M,"black", "Female", PA_bully_rates$RATE_BULLY_F,"red","Bullying Rates")

iss_M_F <- my_plot("Male", PA_iss_rates$RATE_ISS_M,"black", "Female", PA_iss_rates$RATE_ISS_F,"red","In School Suspension (ISS) Rates")

abs_M_F <- my_plot("Male", PA_abs_rates$RATE_ABSENT_M,"black", "Female", PA_abs_rates$RATE_ABSENT_F,"red","Absenteeism Rates")

bully_M_F
## Warning: Ignoring 1 observations
## Warning: Ignoring 8 observations
iss_M_F
## Warning: Ignoring 1 observations
## Warning: Ignoring 6 observations
abs_M_F
## Warning: Ignoring 1 observations
## Warning: Ignoring 8 observations